knitr::opts_chunk$set(
echo = TRUE,
error = FALSE,
comment = "#>",
fig.path = "img/",
fig.retina = 2,
fig.width = 10,
fig.asp = 3/4,
fig.height = 8,
fig.pos = "t",
fig.align = "center",
dpi = 150,
out.width = "90%",
dev.args = list(png = list(type = "cairo-png")),
optipng = "-o1 -quiet"
)
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr 1.1.2 ✔ readr 2.1.4
#> ✔ forcats 1.0.0 ✔ stringr 1.5.0
#> ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
#> ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
#> ✔ purrr 1.0.1
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
#> Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(scorecard)
#>
#> Attaching package: 'scorecard'
#>
#> The following object is masked from 'package:tidyr':
#>
#> replace_na
library(glmnet)
#> Loading required package: Matrix
#>
#> Attaching package: 'Matrix'
#>
#> The following objects are masked from 'package:tidyr':
#>
#> expand, pack, unpack
#>
#> Loaded glmnet 4.1-8
library(ggplot2)
library(plotly)
#>
#> Attaching package: 'plotly'
#>
#> The following object is masked from 'package:ggplot2':
#>
#> last_plot
#>
#> The following object is masked from 'package:stats':
#>
#> filter
#>
#> The following object is masked from 'package:graphics':
#>
#> layout
library(dplyr)
library(knitr)
library(gridExtra)
#>
#> Attaching package: 'gridExtra'
#>
#> The following object is masked from 'package:dplyr':
#>
#> combine
library(grid)
library(cluster)
library(factoextra)
library(modeest)
#> Registered S3 method overwritten by 'rmutil':
#> method from
#> print.response httr
# Load the data
train_data <- read.csv("C:/Users/lenovo/Downloads/train.csv")
# Preview the data
str(train_data)
#> 'data.frame': 81738 obs. of 21 variables:
#> $ loan_amnt : int 5000 2400 5000 3000 5600 5375 6500 9000 3000 10000 ...
#> $ funded_amnt : int 5000 2400 5000 3000 5600 5375 6500 9000 3000 10000 ...
#> $ pymnt_plan : chr "n" "n" "n" "n" ...
#> $ grade : chr "B" "C" "A" "E" ...
#> $ sub_grade_num : num 0.4 1 0.8 0.2 0.4 1 0.6 0.2 0.2 0.4 ...
#> $ short_emp : int 0 0 0 0 0 1 0 1 0 0 ...
#> $ emp_length_num : int 11 11 4 10 5 1 6 1 4 4 ...
#> $ home_ownership : chr "RENT" "RENT" "RENT" "RENT" ...
#> $ dti : num 27.65 8.72 11.2 5.35 5.55 ...
#> $ purpose : chr "credit_card" "small_business" "wedding" "car" ...
#> $ payment_inc_ratio : num 8.14 8.26 5.22 2.74 4.57 ...
#> $ delinq_2yrs : int 0 0 0 0 0 0 0 0 0 0 ...
#> $ delinq_2yrs_zero : int 1 1 1 1 1 1 1 1 1 1 ...
#> $ inq_last_6mths : int 1 2 3 2 2 0 2 1 2 2 ...
#> $ last_delinq_none : int 1 1 1 1 1 1 1 1 1 1 ...
#> $ last_major_derog_none: int 1 1 1 1 1 1 1 1 1 1 ...
#> $ open_acc : int 3 2 9 4 11 2 14 4 11 14 ...
#> $ pub_rec : int 0 0 0 0 0 0 0 0 0 0 ...
#> $ pub_rec_zero : int 1 1 1 1 1 1 1 1 1 1 ...
#> $ revol_util : num 83.7 98.5 28.3 87.5 32.6 36.5 20.6 91.7 43.1 55.5 ...
#> $ bad_loans : int 0 0 0 0 1 1 0 1 0 1 ...
purpose_counts <- table(train_data$purpose)
print(purpose_counts)
#>
#> car credit_card debt_consolidation home_improvement
#> 1570 14722 45428 4990
#> house major_purchase medical moving
#> 665 2580 1085 792
#> other small_business vacation wedding
#> 6107 2173 587 1039
home_ownership_counts <- table(train_data$home_ownership)
print(home_ownership_counts)
#>
#> MORTGAGE OTHER OWN RENT
#> 39583 116 6639 35400
grade_counts <- table(train_data$grade)
print(grade_counts)
#>
#> A B C D E F G
#> 14812 24775 19928 12847 6022 2611 743
# Check for missing values and duplicates
sum(is.na(train_data))
#> [1] 97
# Convert categorical data to numeric
train_data <- train_data %>%
mutate(
grade = as.numeric(factor(grade), levels = c("A", "B", "C", "D", "E", "F" , "G")),
purpose = as.numeric(factor(purpose), levels = c("car", "credit_card", "debt_consolidation", "home_improvement", "house", "major_purchase","medical","moving", "other", "small_business", "vacation", "wedding")),
home_ownership = as.numeric(factor(home_ownership), levels = c("MORTGAGE", "OTHER", "OWN", "RENT"))
)
# Remove unnecessary columns
train_data <- dplyr::select(train_data, -pymnt_plan)
# Check for missing values and handle them
na_counts <- sapply(train_data, function(x) sum(is.na(x)))
print(na_counts)
#> loan_amnt funded_amnt grade
#> 0 0 0
#> sub_grade_num short_emp emp_length_num
#> 0 0 0
#> home_ownership dti purpose
#> 0 0 0
#> payment_inc_ratio delinq_2yrs delinq_2yrs_zero
#> 1 16 16
#> inq_last_6mths last_delinq_none last_major_derog_none
#> 16 0 0
#> open_acc pub_rec pub_rec_zero
#> 16 16 16
#> revol_util bad_loans
#> 0 0
set.seed(1)
total_rows <- nrow(train_data)
subset_size <- total_rows / 2
random_subset <- train_data %>%
sample_n(subset_size)
cat("Total rows:", total_rows, "\n")
#> Total rows: 81738
cat("Subset size:", subset_size, "\n")
#> Subset size: 40869
print(head(random_subset))
#> loan_amnt funded_amnt grade sub_grade_num short_emp emp_length_num
#> 1 12000 12000 3 1.0 0 4
#> 2 18550 18550 4 0.4 0 5
#> 3 10000 10000 4 0.6 0 5
#> 4 9600 9600 1 0.4 1 0
#> 5 12000 12000 1 0.4 0 11
#> 6 6000 6000 5 0.4 0 6
#> home_ownership dti purpose payment_inc_ratio delinq_2yrs delinq_2yrs_zero
#> 1 4 9.14 3 11.76200 0 1
#> 2 4 18.64 7 8.77438 0 1
#> 3 1 10.87 2 6.54269 0 1
#> 4 4 19.14 3 5.89520 1 0
#> 5 1 4.20 10 6.23880 0 1
#> 6 4 19.92 3 4.27303 0 1
#> inq_last_6mths last_delinq_none last_major_derog_none open_acc pub_rec
#> 1 5 0 1 15 1
#> 2 1 1 1 4 0
#> 3 3 1 1 8 0
#> 4 1 0 1 13 0
#> 5 0 1 1 14 0
#> 6 6 0 1 16 0
#> pub_rec_zero revol_util bad_loans
#> 1 0 63.5 0
#> 2 1 47.1 0
#> 3 1 50.8 0
#> 4 1 23.7 0
#> 5 1 6.0 0
#> 6 1 45.1 0
# dataframe
train_data1 <- random_subset
iv = iv(train_data1, y = 'bad_loans') %>%
as_tibble() %>%
mutate( info_value = round(info_value, 3) ) %>%
arrange( desc(info_value) )
iv %>%
knitr::kable()
| variable | info_value |
|---|---|
| dti | 0.485 |
| grade | 0.329 |
| revol_util | 0.252 |
| loan_amnt | 0.236 |
| funded_amnt | 0.234 |
| payment_inc_ratio | 0.081 |
| purpose | 0.042 |
| inq_last_6mths | 0.041 |
| home_ownership | 0.017 |
| emp_length_num | 0.010 |
| open_acc | 0.008 |
| short_emp | 0.004 |
| delinq_2yrs | 0.003 |
| pub_rec | 0.001 |
| delinq_2yrs_zero | 0.001 |
| sub_grade_num | 0.001 |
| last_delinq_none | 0.000 |
| last_major_derog_none | 0.000 |
| pub_rec_zero | 0.000 |
bins = woebin(train_data1, y = 'bad_loans')
#> ℹ Creating woe binning ...
#> ✔ Binning on 40869 rows and 20 columns in 00:00:12
####plot-bins
variables <- names(bins[])
for (var in variables) {
# Print the table for each variable
bins[[var]] %>%
knitr::kable()
# Plot with specified colors
plot <- woebin_plot(bins[[var]], line_color = 'grey4', bar_color = c('steelblue3', 'sandybrown'), show_barval = FALSE)
# Print the plot
print(plot)
}
#> $loan_amnt
#>
#> $funded_amnt
#>
#> $grade
#>
#> $sub_grade_num
#>
#> $short_emp
#>
#> $emp_length_num
#>
#> $home_ownership
#>
#> $dti
#>
#> $purpose
#>
#> $payment_inc_ratio
#>
#> $delinq_2yrs
#>
#> $delinq_2yrs_zero
#>
#> $inq_last_6mths
#>
#> $last_delinq_none
#>
#> $last_major_derog_none
#>
#> $open_acc
#>
#> $pub_rec
#>
#> $pub_rec_zero
#>
#> $revol_util
plots <- list()
for (var in variables[c(1:7, 10, 12)]) {
if (var %in% names(bins)) {
plot <- woebin_plot(bins[[var]], line_color = 'grey4', bar_color = c('steelblue3', 'sandybrown'), show_barval = FALSE)
plots[[var]] <- plot[[1]]
} else {
cat("\n### Variable:", var, "not found in bins\n")
}
}
do.call(grid.arrange, c(plots, ncol = 3))
# Define a function to add percentage columns
add_percentage_columns <- function(bin_data) {
bin_data %>%
mutate(
percentage_pos = (pos / count) * 100,
percentage_neg = (neg / count) * 100
)
}
# Apply the function to each bin in the list
for (var in variables) {
if (var %in% names(bins)) {
bins[[var]] <- add_percentage_columns(bins[[var]])
cat("\n### Added percentage columns to variable:", var, "\n")
} else {
cat("\n### Variable:", var, "not found in bins\n")
}
}
#>
#> ### Added percentage columns to variable: loan_amnt
#>
#> ### Added percentage columns to variable: funded_amnt
#>
#> ### Added percentage columns to variable: grade
#>
#> ### Added percentage columns to variable: sub_grade_num
#>
#> ### Added percentage columns to variable: short_emp
#>
#> ### Added percentage columns to variable: emp_length_num
#>
#> ### Added percentage columns to variable: home_ownership
#>
#> ### Added percentage columns to variable: dti
#>
#> ### Added percentage columns to variable: purpose
#>
#> ### Added percentage columns to variable: payment_inc_ratio
#>
#> ### Added percentage columns to variable: delinq_2yrs
#>
#> ### Added percentage columns to variable: delinq_2yrs_zero
#>
#> ### Added percentage columns to variable: inq_last_6mths
#>
#> ### Added percentage columns to variable: last_delinq_none
#>
#> ### Added percentage columns to variable: last_major_derog_none
#>
#> ### Added percentage columns to variable: open_acc
#>
#> ### Added percentage columns to variable: pub_rec
#>
#> ### Added percentage columns to variable: pub_rec_zero
#>
#> ### Added percentage columns to variable: revol_util
######################################
for (var in variables) {
if (var %in% names(bins)) {
table <- bins[[var]] %>%
dplyr::select(variable, bin, woe, count, percentage_pos, percentage_neg) %>%
dplyr::rename(
Variable = variable,
Bin = bin,
WOE = woe,
Count = count,
`Positive Rate (%)` = percentage_pos,
`Negative Rate (%)` = percentage_neg
)
cat("\n### Variable:", var, "\n")
print(knitr::kable(table, format = "pipe", digits = 2))
} else {
cat("\n### Variable:", var, "not found in bins\n")
}
}
#>
#> ### Variable: loan_amnt
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:---------|:------------|-----:|-----:|-----------------:|-----------------:|
#> |loan_amnt |[-Inf,8000) | -0.18| 12535| 16.51| 83.49|
#> |loan_amnt |[8000,15500) | -0.07| 15915| 18.09| 81.91|
#> |loan_amnt |[15500, Inf) | 0.24| 12419| 23.05| 76.95|
#>
#> ### Variable: funded_amnt
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:-----------|:------------|-----:|-----:|-----------------:|-----------------:|
#> |funded_amnt |[-Inf,8000) | -0.17| 12618| 16.56| 83.44|
#> |funded_amnt |[8000,15500) | -0.07| 15990| 18.11| 81.89|
#> |funded_amnt |[15500, Inf) | 0.24| 12261| 23.04| 76.96|
#>
#> ### Variable: grade
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:--------|:--------|-----:|-----:|-----------------:|-----------------:|
#> |grade |[-Inf,2) | -1.12| 7309| 7.14| 92.86|
#> |grade |[2,3) | -0.32| 12370| 14.66| 85.34|
#> |grade |[3,4) | 0.10| 10069| 20.77| 79.23|
#> |grade |[4,5) | 0.42| 6414| 26.47| 73.53|
#> |grade |[5, Inf) | 0.86| 4707| 35.84| 64.16|
#>
#> ### Variable: sub_grade_num
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:-------------|:----------|-----:|-----:|-----------------:|-----------------:|
#> |sub_grade_num |[-Inf,0.4) | -0.05| 7966| 18.40| 81.60|
#> |sub_grade_num |[0.4,0.6) | -0.01| 8283| 18.97| 81.03|
#> |sub_grade_num |[0.6,0.8) | 0.00| 8399| 19.17| 80.83|
#> |sub_grade_num |[0.8, Inf) | 0.03| 16221| 19.51| 80.49|
#>
#> ### Variable: short_emp
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:---------|:--------|-----:|-----:|-----------------:|-----------------:|
#> |short_emp |[-Inf,1) | -0.02| 35773| 18.74| 81.26|
#> |short_emp |[1, Inf) | 0.16| 5096| 21.74| 78.26|
#>
#> ### Variable: emp_length_num
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:--------------|:--------|-----:|-----:|-----------------:|-----------------:|
#> |emp_length_num |[-Inf,2) | 0.16| 5096| 21.74| 78.26|
#> |emp_length_num |[2,3) | 0.03| 2811| 19.53| 80.47|
#> |emp_length_num |[3, Inf) | -0.03| 32962| 18.67| 81.33|
#>
#> ### Variable: home_ownership
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:--------------|:--------|-----:|-----:|-----------------:|-----------------:|
#> |home_ownership |[-Inf,2) | -0.14| 19843| 17.08| 82.92|
#> |home_ownership |[2,4) | 0.08| 3373| 20.43| 79.57|
#> |home_ownership |[4, Inf) | 0.13| 17653| 21.15| 78.85|
#>
#> ### Variable: dti
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:--------|:---------|-----:|-----:|-----------------:|-----------------:|
#> |dti |[-Inf,7) | -0.37| 5650| 14.07| 85.93|
#> |dti |[7,17) | -0.14| 18037| 17.03| 82.97|
#> |dti |[17,26) | 0.14| 13550| 21.37| 78.63|
#> |dti |[26, Inf) | 0.54| 3632| 28.91| 71.09|
#>
#> ### Variable: purpose
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:--------|:--------|-----:|-----:|-----------------:|-----------------:|
#> |purpose |[-Inf,3) | -0.20| 8128| 16.25| 83.75|
#> |purpose |[3,4) | 0.03| 22724| 19.54| 80.46|
#> |purpose |[4,6) | -0.16| 2782| 16.75| 83.25|
#> |purpose |[6,9) | -0.20| 2270| 16.26| 83.74|
#> |purpose |[9, Inf) | 0.32| 4965| 24.47| 75.53|
#>
#> ### Variable: payment_inc_ratio
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:-----------------|:----------|-----:|-----:|-----------------:|-----------------:|
#> |payment_inc_ratio |[-Inf,6.5) | -0.35| 18679| 14.21| 85.79|
#> |payment_inc_ratio |[6.5,10) | -0.03| 11456| 18.60| 81.40|
#> |payment_inc_ratio |[10,12) | 0.32| 4357| 24.51| 75.49|
#> |payment_inc_ratio |[12, Inf) | 0.63| 6377| 30.69| 69.31|
#>
#> ### Variable: delinq_2yrs
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:-----------|:------------------|-----:|-----:|-----------------:|-----------------:|
#> |delinq_2yrs |[-Inf,1)%,%missing | -0.01| 35079| 18.95| 81.05|
#> |delinq_2yrs |[1, Inf) | 0.06| 5790| 20.12| 79.88|
#>
#> ### Variable: delinq_2yrs_zero
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:----------------|:------------------|-----:|-----:|-----------------:|-----------------:|
#> |delinq_2yrs_zero |[-Inf,1)%,%missing | 0.06| 5796| 20.10| 79.90|
#> |delinq_2yrs_zero |[1, Inf) | -0.01| 35073| 18.95| 81.05|
#>
#> ### Variable: inq_last_6mths
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:--------------|:------------------|-----:|-----:|-----------------:|-----------------:|
#> |inq_last_6mths |[-Inf,1)%,%missing | -0.18| 18623| 16.42| 83.58|
#> |inq_last_6mths |[1,2) | 0.04| 11548| 19.67| 80.33|
#> |inq_last_6mths |[2,3) | 0.14| 6114| 21.43| 78.57|
#> |inq_last_6mths |[3, Inf) | 0.37| 4584| 25.57| 74.43|
#>
#> ### Variable: last_delinq_none
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:----------------|:--------|-----:|-----:|-----------------:|-----------------:|
#> |last_delinq_none |[-Inf,1) | 0.01| 16910| 19.32| 80.68|
#> |last_delinq_none |[1, Inf) | -0.01| 23959| 18.97| 81.03|
#>
#> ### Variable: last_major_derog_none
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:---------------------|:--------|-----:|-----:|-----------------:|-----------------:|
#> |last_major_derog_none |[-Inf,1) | -0.03| 5181| 18.70| 81.30|
#> |last_major_derog_none |[1, Inf) | 0.00| 35688| 19.17| 80.83|
#>
#> ### Variable: open_acc
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:--------|:------------------|-----:|-----:|-----------------:|-----------------:|
#> |open_acc |[-Inf,5)%,%missing | 0.06| 2591| 19.99| 80.01|
#> |open_acc |[5,7) | -0.05| 5434| 18.42| 81.58|
#> |open_acc |[7,10) | 0.00| 11422| 19.06| 80.94|
#> |open_acc |[10,12) | 0.03| 6942| 19.56| 80.44|
#> |open_acc |[12,13) | 0.09| 2893| 20.57| 79.43|
#> |open_acc |[13,14) | -0.06| 2378| 18.17| 81.83|
#> |open_acc |[14,16) | 0.03| 3601| 19.58| 80.42|
#> |open_acc |[16, Inf) | -0.05| 5608| 18.28| 81.72|
#>
#> ### Variable: pub_rec
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:--------|:------------------|-----:|-----:|-----------------:|-----------------:|
#> |pub_rec |[-Inf,1)%,%missing | 0.00| 37093| 19.14| 80.86|
#> |pub_rec |[1, Inf) | -0.02| 3776| 18.80| 81.20|
#>
#> ### Variable: pub_rec_zero
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:------------|:------------------|-----:|-----:|-----------------:|-----------------:|
#> |pub_rec_zero |[-Inf,1)%,%missing | -0.02| 3782| 18.77| 81.23|
#> |pub_rec_zero |[1, Inf) | 0.00| 37087| 19.15| 80.85|
#>
#> ### Variable: revol_util
#>
#>
#> |Variable |Bin | WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:----------|:---------|-----:|-----:|-----------------:|-----------------:|
#> |revol_util |[-Inf,26) | -0.55| 6902| 12.03| 87.97|
#> |revol_util |[26,50) | -0.18| 10387| 16.45| 83.55|
#> |revol_util |[50,72) | 0.07| 12001| 20.21| 79.79|
#> |revol_util |[72,86) | 0.25| 6892| 23.35| 76.65|
#> |revol_util |[86, Inf) | 0.42| 4687| 26.41| 73.59|
results_list <- list()
for (var in variables) {
if (var %in% names(bins)) {
table <- bins[[var]] %>%
dplyr::select(variable, bin, woe, count, percentage_pos, percentage_neg, bin_iv) %>%
dplyr::rename(
Variable = variable,
Bin = bin,
WOE = woe,
Count = count,
iv = bin_iv,
`Positive Rate (%)` = percentage_pos,
`Negative Rate (%)` = percentage_neg
)
results_list[[var]] <- table
}
}
combined_results <- bind_rows(results_list, .id = "Variable")
train_woe <- woebin_ply(train_data1, bins)
#> ℹ Converting into woe values ...
#> ✔ Woe transformating on 40869 rows and 19 columns in 00:00:11
df_train <- train_woe
# Remove unnecessary columns
df1 <- dplyr::select(df_train, -bad_loans)
# Perform PCA
data_pca <- prcomp(df1, center = TRUE, scale. = F)
summary(data_pca)
#> Importance of components:
#> PC1 PC2 PC3 PC4 PC5 PC6 PC7
#> Standard deviation 0.6260 0.3721 0.2820 0.22881 0.20052 0.17533 0.15299
#> Proportion of Variance 0.5004 0.1768 0.1016 0.06685 0.05135 0.03926 0.02989
#> Cumulative Proportion 0.5004 0.6772 0.7788 0.84562 0.89696 0.93622 0.96610
#> PC8 PC9 PC10 PC11 PC12 PC13 PC14
#> Standard deviation 0.12092 0.08631 0.04382 0.03642 0.02566 0.01404 0.01132
#> Proportion of Variance 0.01867 0.00951 0.00245 0.00169 0.00084 0.00025 0.00016
#> Cumulative Proportion 0.98478 0.99429 0.99674 0.99843 0.99927 0.99953 0.99969
#> PC15 PC16 PC17 PC18 PC19
#> Standard deviation 0.009821 0.009364 0.007662 0.0006635 2.419e-16
#> Proportion of Variance 0.000120 0.000110 0.000070 0.0000000 0.000e+00
#> Cumulative Proportion 0.999810 0.999920 1.000000 1.0000000 1.000e+00
loadings <- data_pca$rotation
head(loadings)
#> PC1 PC2 PC3 PC4
#> loan_amnt_woe 0.073925136 -0.253820895 0.063586182 -0.198899745
#> funded_amnt_woe 0.073912131 -0.251889280 0.062201727 -0.195292233
#> grade_woe 0.945146341 0.221287928 0.199161724 0.035117355
#> sub_grade_num_woe -0.005563003 -0.005599729 -0.009461251 -0.002620156
#> short_emp_woe -0.002213074 -0.001496193 0.005141298 0.004143282
#> emp_length_num_woe -0.002408340 -0.001046150 0.006117357 0.004535543
#> PC5 PC6 PC7 PC8
#> loan_amnt_woe -0.596521308 0.083046755 -0.070652246 0.16337237
#> funded_amnt_woe -0.585355665 0.082238273 -0.068350612 0.16071107
#> grade_woe 0.032423462 0.105600793 0.057545178 -0.03161873
#> sub_grade_num_woe -0.004397023 -0.013970272 -0.011040741 0.01266304
#> short_emp_woe 0.044973686 -0.008747212 -0.003109089 0.06880736
#> emp_length_num_woe 0.049823527 -0.009151991 -0.003684869 0.07775236
#> PC9 PC10 PC11 PC12
#> loan_amnt_woe -0.024335682 0.0044949026 -0.003463462 -0.0037211962
#> funded_amnt_woe -0.024055461 0.0030525030 0.001655281 0.0140260871
#> grade_woe -0.004505497 -0.0008711838 -0.011588331 -0.0116795853
#> sub_grade_num_woe 0.001316874 0.0092330898 0.056617537 -0.9974973244
#> short_emp_woe -0.693738384 0.0058779791 0.001623484 0.0007030922
#> emp_length_num_woe -0.709286624 0.0054178875 0.003796025 -0.0005814363
#> PC13 PC14 PC15 PC16
#> loan_amnt_woe 0.6992147436 -0.049363944 -0.0101610460 0.002800952
#> funded_amnt_woe -0.7113296330 0.047644996 0.0111328205 0.001980115
#> grade_woe 0.0006441983 0.004242974 0.0001074442 -0.002173667
#> sub_grade_num_woe -0.0106377005 0.025865070 0.0001360339 -0.014421276
#> short_emp_woe -0.0152838291 -0.074032288 -0.6886503038 -0.178334445
#> emp_length_num_woe 0.0148283926 0.067794163 0.6727614217 0.175270894
#> PC17 PC18 PC19
#> loan_amnt_woe -0.001259353 4.512122e-05 2.014199e-15
#> funded_amnt_woe -0.001106719 -4.082085e-05 -7.996344e-16
#> grade_woe 0.001585032 8.722092e-06 9.584531e-17
#> sub_grade_num_woe 0.007808105 1.586855e-05 3.545324e-16
#> short_emp_woe -0.006202354 -3.354520e-04 4.690120e-16
#> emp_length_num_woe 0.005881672 6.461365e-05 -5.546858e-16
fviz_eig(data_pca, addlabels = TRUE)
# Check the PCA output
head(data_pca$x)
#> PC1 PC2 PC3 PC4 PC5 PC6
#> [1,] 0.2921316 -0.19089166 0.1057117 -0.05820084 0.219797540 -0.412702689
#> [2,] 0.5134660 -0.03691616 0.2125867 0.11715191 -0.304435505 0.095277648
#> [3,] 0.5207891 0.18552810 0.1064171 -0.05423267 0.007266799 -0.377334274
#> [4,] -1.1263020 0.07704632 0.1638929 0.32977103 -0.098309680 0.007099912
#> [5,] -1.1771336 0.15374169 0.3223907 -0.14675212 -0.013589018 0.164693871
#> [6,] 0.8368147 0.58757265 0.2756430 0.36246019 0.045630945 -0.191133919
#> PC7 PC8 PC9 PC10 PC11
#> [1,] -0.05091001 0.09821871 0.06706506 0.028341563 -0.005671512
#> [2,] 0.18346330 0.20878863 0.03659958 0.059627400 -0.022679101
#> [3,] 0.19399641 -0.10711691 0.02778540 -0.003411623 -0.020516487
#> [4,] -0.06360999 0.19311549 -0.21201255 -0.055867561 0.097186839
#> [5,] -0.31250876 -0.15023069 0.02451762 0.028734360 -0.016472905
#> [6,] 0.01093339 0.11298191 0.05369666 -0.053209611 -0.023127403
#> PC12 PC13 PC14 PC15 PC16
#> [1,] -0.0210751094 -0.0009599073 -0.012285314 -0.008755016 0.023993526
#> [2,] 0.0045274794 -0.0013699203 0.008656917 -0.003202198 -0.002783650
#> [3,] -0.0077617866 0.0004323956 0.009952554 -0.002231300 -0.004361317
#> [4,] 0.0209509095 -0.0005126908 0.003215737 -0.003733679 -0.001526565
#> [5,] 0.0111541083 -0.0014094692 0.004725085 -0.002400555 -0.001721166
#> [6,] -0.0002443128 -0.0013473640 -0.003498016 -0.001031571 -0.009234172
#> PC17 PC18 PC19
#> [1,] -0.017875967 -2.174590e-05 2.755393e-16
#> [2,] 0.003341090 1.209072e-05 3.543329e-16
#> [3,] 0.004376651 3.914620e-05 -5.839496e-17
#> [4,] -0.007516465 -6.584101e-05 -2.862365e-16
#> [5,] 0.004050195 -5.141151e-05 -1.828037e-16
#> [6,] -0.011053840 4.810600e-05 -1.036341e-16
eig.val<-get_eigenvalue(data_pca)
eig.val
#> eigenvalue variance.percent cumulative.variance.percent
#> Dim.1 3.918482e-01 5.003707e+01 50.03707
#> Dim.2 1.384765e-01 1.768276e+01 67.71983
#> Dim.3 7.953661e-02 1.015643e+01 77.87626
#> Dim.4 5.235333e-02 6.685260e+00 84.56152
#> Dim.5 4.020963e-02 5.134570e+00 89.69609
#> Dim.6 3.074142e-02 3.925526e+00 93.62162
#> Dim.7 2.340642e-02 2.988883e+00 96.61050
#> Dim.8 1.462112e-02 1.867045e+00 98.47754
#> Dim.9 7.449615e-03 9.512788e-01 99.42882
#> Dim.10 1.919864e-03 2.451571e-01 99.67398
#> Dim.11 1.326062e-03 1.693315e-01 99.84331
#> Dim.12 6.586397e-04 8.410501e-02 99.92742
#> Dim.13 1.970018e-04 2.515616e-02 99.95257
#> Dim.14 1.281299e-04 1.636155e-02 99.96893
#> Dim.15 9.645838e-05 1.231726e-02 99.98125
#> Dim.16 8.767756e-05 1.119599e-02 99.99245
#> Dim.17 5.870826e-05 7.496753e-03 99.99994
#> Dim.18 4.402401e-07 5.621647e-05 100.00000
#> Dim.19 5.852888e-32 7.473848e-30 100.00000
# Determine the optimal number of clusters using the Elbow method on PCA components
data_pca_final<-prcomp(df1, center=FALSE, scale.=FALSE, rank. = 3)
results <- data_pca_final$x
var<-get_pca_var(data_pca)
a<-fviz_contrib(data_pca, "var", axes=1, xtickslab.rt=90) # default angle=45°
plot(a,main = "Variables percentage contribution of first Principal Components")
b<-fviz_contrib(data_pca, "var", axes=2, xtickslab.rt=90) # default angle=45°
plot(a,main = "Variables percentage contribution of second Principal Components")
c<-fviz_contrib(data_pca, "var", axes=3, xtickslab.rt=90) # default angle=45°
plot(a,main = "Variables percentage contribution of second Principal Components")
d<-fviz_contrib(data_pca, "var", axes=4, xtickslab.rt=90) # default angle=45°
plot(a,main = "Variables percentage contribution of second Principal Components")
library("corrplot")
#> corrplot 0.92 loaded
corrplot(var$cos2, is.corr=FALSE)
fviz_pca_var(data_pca,
col.var = "cos2", # Color by the quality of representation
gradient.cols = c("darkorchid4", "gold", "darkorange"),
repel = TRUE
)
######################################
### Rename columns to f1, f2, f3, ...
df1_new <- df1
new_col_names <- paste0("f", seq_along(colnames(df1_new)))
names(df1_new) <- new_col_names
# Create a mapping table of original and new column names
original_col_names <- colnames(dplyr::select(train_woe, -bad_loans))
data_pca1 <- prcomp(df1_new, center = TRUE, scale. = F)
loadings <- data_pca1$rotation
# Create a mapping table for reference
mapping_table <- data.frame(
New_Name = new_col_names,
Original_Name = colnames(dplyr::select(train_woe, -bad_loans))
)
fviz_pca_var(data_pca1,
col.var = "cos2", # Color by the quality of representation
gradient.cols = c("darkorchid4", "gold", "darkorange"),
repel = TRUE
)
# Determine the optimal number of clusters using the Elbow method on PCA components
data_pca_final<-prcomp(df1, center=FALSE, scale.=FALSE, rank. = 3)
results <- data_pca_final$x
set.seed(1)
wss2 <- function(k) {
kmeans(df1, centers = k, iter.max = 200, nstart = 100)$tot.withinss
}
k.values <- 1:10
wss_values2 <- sapply(k.values, wss2)
# Elbow method
plot(k.values, wss_values2, type = 'b', xlab = 'Number of Clusters', ylab = 'Total Within-Cluster Sum of Squares', main = 'Elbow Method')
# Perform k-means clustering with 4 cluster
km4 <- kmeans(df1, centers = 4, nstart =100)
fviz_cluster(km4, data = df1) +
scale_color_manual(values = c('steelblue3', 'sandybrown', '#9CDB9E', 'lightpink4')) +
scale_fill_manual(values = c('steelblue3', 'sandybrown', '#9CDB9E', 'lightpink4')) +
ggtitle("Cluster for All Variables") +
theme_minimal()
set.seed(2)
wss1 <- function(k) {
kmeans(results, centers = k, iter.max = 100, nstart = 50)$tot.withinss
}
k.values <- 1:10
wss_values1 <- sapply(k.values, wss1)
wss_values1
#> [1] 24923.572 14400.046 10627.189 8140.346 6908.100 5934.548 5332.015
#> [8] 4739.680 4306.678 3945.009
# Plot the WSS values for each number of clusters
plot(k.values, wss_values1, type = 'b', xlab = 'Number of Clusters', ylab = 'Total Within-Cluster Sum of Squares', main = 'Elbow Method')
# Perform k-means clustering
set.seed(1)
km_res <- kmeans(results, centers = 3, nstart = 100)
fviz_cluster(km_res, data = results) +
scale_color_manual(values = c('steelblue3', 'sandybrown', '#9CDB9E')) +
scale_fill_manual(values = c('steelblue3', 'sandybrown', '#9CDB9E')) +
ggtitle("3 Cluster for result of pca") +
theme_minimal()
Customers_Segments <- data.frame(results, cluster = as.factor(km_res$cluster))
km_res$size
#> [1] 7309 19836 13724
km_res$centers
#> PC1 PC2 PC3
#> 1 -1.1658261 -0.08222403 -0.007465017
#> 2 -0.1858829 0.09897490 -0.011609765
#> 3 0.5382256 -0.04571766 0.024009623
df1$groupkm <- km_res$cluster
g1<- df1[df1$groupkm==1,]
g2<- df1[df1$groupkm==2,]
g3<- df1[df1$groupkm==2,]
### Group 1
blue_palette <- colorRampPalette(c("lightblue", "blue", "darkblue"))
colors <- blue_palette(20)
#funded_amnt
g1 %>%
ggplot(aes(x = funded_amnt_woe, fill = factor(funded_amnt_woe))) +
geom_bar(color = "grey20") + guides(fill = FALSE)+
geom_text(stat = "count", aes(label = ..count..),
vjust = -0.3, size = 3.5) +
labs(title="g1_funded_amnt_woe")+
scale_x_continuous(breaks = seq(min(g1$funded_amnt_woe), max(g1$funded_amnt_woe) , by = 0.095))+
scale_fill_manual(values = colors[1:length(unique(g1$loan_amnt_woe))])
#> Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
#> of ggplot2 3.3.4.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
#> Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
#> ℹ Please use `after_stat(count)` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
min(g1$funded_amnt_woe)
#> [1] -0.1741437
max(g1$funded_amnt_woe)
#> [1] 0.2367068
mean(g1$funded_amnt_woe)
#> [1] -0.05075121
# Create summary table for each cluster
cluster_summary <- df1 %>%
group_by(groupkm) %>%
summarise(
count = n(),
loan_amnt_min = min(loan_amnt_woe),
loan_amnt_max = max(loan_amnt_woe),
loan_amnt_mean = mean(loan_amnt_woe),
funded_amnt_min = min(funded_amnt_woe),
funded_amnt_max = max(funded_amnt_woe),
funded_amnt_mean = mean(funded_amnt_woe),
sub_grade_num_min = min(sub_grade_num_woe),
sub_grade_num_max = max(sub_grade_num_woe),
emp_length_num_min = min(emp_length_num_woe),
emp_length_num_max = max(emp_length_num_woe),
dti_min = min(dti_woe),
dti_max = max(dti_woe),
dti_mean = mean(dti_woe),
revol_util_min = min(revol_util_woe),
revol_util_max = max(revol_util_woe),
revol_util_mean = mean(revol_util_woe),
)
# Display the summary
cluster_summary <- t(cluster_summary)
print(cluster_summary)
#> [,1] [,2] [,3]
#> groupkm 1.00000000 2.000000e+00 3.000000e+00
#> count 7309.00000000 1.983600e+04 1.372400e+04
#> loan_amnt_min -0.17833591 -1.783359e-01 -1.783359e-01
#> loan_amnt_max 0.23743139 2.374314e-01 2.374314e-01
#> loan_amnt_mean -0.05015715 -3.948019e-02 5.741985e-02
#> funded_amnt_min -0.17414371 -1.741437e-01 -1.741437e-01
#> funded_amnt_max 0.23670680 2.367068e-01 2.367068e-01
#> funded_amnt_mean -0.05075121 -3.863630e-02 5.722962e-02
#> sub_grade_num_min -0.04653398 -4.653398e-02 -4.653398e-02
#> sub_grade_num_max 0.02524326 2.524326e-02 2.524326e-02
#> emp_length_num_min -0.02886748 -2.886748e-02 -2.886748e-02
#> emp_length_num_max 0.16199733 1.619973e-01 1.619973e-01
#> dti_min -0.36669167 -3.666917e-01 -3.666917e-01
#> dti_max 0.54295647 5.429565e-01 5.429565e-01
#> dti_mean -0.08493578 -3.378141e-02 3.933095e-02
#> revol_util_min -0.54728702 -5.472870e-01 -5.472870e-01
#> revol_util_max 0.41814344 4.181434e-01 4.181434e-01
#> revol_util_mean -0.25603657 -3.597186e-02 1.061340e-01
cluster_summary_df <- as.data.frame(cluster_summary)
###table
#grade
grade_summary <- df1 %>%
group_by(groupkm, grade_woe) %>%
summarise(count = n(), .groups = 'drop') %>%
pivot_wider(names_from = grade_woe, values_from = count, values_fill = list(count = 0))
kable(grade_summary, caption = "Counts of 'grade_woe' by Group")
| groupkm | -1.12236611095412 | -0.319075617367842 | 0.103685264830283 | 0.421220703081032 | 0.860425570044054 |
|---|---|---|---|---|---|
| 1 | 7309 | 0 | 0 | 0 | 0 |
| 2 | 0 | 12370 | 7051 | 415 | 0 |
| 3 | 0 | 0 | 3018 | 5999 | 4707 |
#short_emp
short_emp_summary <- df1 %>%
group_by(groupkm, short_emp_woe) %>%
summarise(count = n(), .groups = 'drop') %>%
pivot_wider(names_from = short_emp_woe, values_from = count, values_fill = list(count = 0))
kable(short_emp_summary, caption = "Counts of 'short_emp_woe' by Group")
| groupkm | -0.0244209293279626 | 0.161997334092502 |
|---|---|---|
| 1 | 6294 | 1015 |
| 2 | 17387 | 2449 |
| 3 | 12092 | 1632 |
#purpose
purpose_summary <- df1 %>%
group_by(groupkm, purpose_woe) %>%
summarise(count = n(), .groups = 'drop') %>%
pivot_wider(names_from = purpose_woe, values_from = count, values_fill = list(count = 0))
kable(purpose_summary, caption = "Counts of 'purpose_woe' by Group")
| groupkm | -0.196831871058625 | -0.196608100580901 | -0.160680606700238 | 0.0273586138416367 | 0.315718834677194 |
|---|---|---|---|---|---|
| 1 | 1686 | 631 | 683 | 3427 | 882 |
| 2 | 4253 | 1029 | 1374 | 10980 | 2200 |
| 3 | 2189 | 610 | 725 | 8317 | 1883 |
#delinq_2yrs
delinq_2yrs_summary <- df1 %>%
group_by(groupkm, delinq_2yrs_woe) %>%
summarise(count = n(), .groups = 'drop') %>%
pivot_wider(names_from = delinq_2yrs_woe, values_from = count, values_fill = list(count = 0))
kable(delinq_2yrs_summary, caption = "Counts of 'delinq_2yrs_woe' by Group")
| groupkm | -0.0108046746319754 | 0.0639753139202767 |
|---|---|---|
| 1 | 6860 | 449 |
| 2 | 16881 | 2955 |
| 3 | 11338 | 2386 |
kable(grade_summary, caption = "Counts of 'grade_woe' by Group")
| groupkm | -1.12236611095412 | -0.319075617367842 | 0.103685264830283 | 0.421220703081032 | 0.860425570044054 |
|---|---|---|---|---|---|
| 1 | 7309 | 0 | 0 | 0 | 0 |
| 2 | 0 | 12370 | 7051 | 415 | 0 |
| 3 | 0 | 0 | 3018 | 5999 | 4707 |
kable(short_emp_summary, caption = "Counts of 'short_emp_woe' by Group")
| groupkm | -0.0244209293279626 | 0.161997334092502 |
|---|---|---|
| 1 | 6294 | 1015 |
| 2 | 17387 | 2449 |
| 3 | 12092 | 1632 |
kable(purpose_summary, caption = "Counts of 'purpose_woe' by Group")
| groupkm | -0.196831871058625 | -0.196608100580901 | -0.160680606700238 | 0.0273586138416367 | 0.315718834677194 |
|---|---|---|---|---|---|
| 1 | 1686 | 631 | 683 | 3427 | 882 |
| 2 | 4253 | 1029 | 1374 | 10980 | 2200 |
| 3 | 2189 | 610 | 725 | 8317 | 1883 |
kable(delinq_2yrs_summary, caption = "Counts of 'delinq_2yrs_woe' by Group")
| groupkm | -0.0108046746319754 | 0.0639753139202767 |
|---|---|---|
| 1 | 6860 | 449 |
| 2 | 16881 | 2955 |
| 3 | 11338 | 2386 |